Plotting the Correlation between Air Quality and Weather


In [1]:
# If done right, this program should plot a chosen air-quality series against the
# time-matched weather series and report their Pearson correlation.

# Shoutout to my bois over at StackOverflow - couldn't've done it without you

import pandas as pd
import numpy as np
from bokeh.plotting import figure
from bokeh.io import show
from bokeh.models import HoverTool, Label
import scipy.stats


# Input helpers ########################################################################

try:
    _read_line = raw_input  # Python 2: raw_input returns the typed text unevaluated
except NameError:
    _read_line = input  # Python 3: input already returns the typed text


def _ask(prompt):
    """Prompt the user and return the reply without surrounding whitespace or quotes.

    Quotes are stripped so replies typed the old way (e.g. 'file.csv', as the
    evaluating Python 2 ``input`` required) keep working.
    """
    return _read_line(prompt).strip().strip("'\"")


def _choose(prompt, options):
    """Prompt for one of the keys of ``options`` and return the matching value.

    Args:
        prompt: Text shown to the user.
        options: Dict mapping each accepted reply to the object it selects.

    Raises:
        ValueError: If the reply is not one of the keys of ``options``.
    """
    choice = _ask(prompt)
    if choice not in options:
        raise ValueError(
            "Unknown data set %r; expected one of: %s"
            % (choice, ", ".join(sorted(options)))
        )
    return options[choice]


# Weather data #########################################################################
weatherfile = _ask("Which weather file would you like to use? ")
df = pd.read_csv(weatherfile)

# Columns are picked by position. DataFrame.as_matrix was removed in pandas 1.0,
# so the positional slices go through .iloc/.values instead. Every series is
# flattened to 1-D so that indexing it yields a plain scalar.
# NOTE(review): presumably column 2 is the unix timestamp and 3/4/5 are
# temp/humidity/pressure -- confirm against the CSV header.
unix_timeweather = df.iloc[:, 2:3].values.ravel()
temp = df.iloc[:, 3:4].values.ravel()
humidity = df.iloc[:, 4:5].values.ravel()
pressure = df.iloc[:, 5:].values.ravel()

# A dict lookup replaces eval(): the reply can only select one of the known
# arrays and can no longer execute arbitrary code.
w_used = _choose(
    "Which data set do you want? temp, humidity, or pressure? ",
    {"temp": temp, "humidity": humidity, "pressure": pressure},
)

######################################################################################
aqfile = _ask("Which air quality file would you like to use? ")
df2 = pd.read_csv(aqfile)

# NOTE(review): presumably column 2 is the unix timestamp and 3/4/5 are
# PM1/PM2.5/PM10 -- confirm against the CSV header.
unix_timeaq = df2.iloc[:, 2:3].values.ravel()
PM1 = df2.iloc[:, 3:4].values.ravel()
PM25 = df2.iloc[:, 4:5].values.ravel()
PM10 = df2.iloc[:, 5:].values.ravel()

aq_used = _choose(
    "Which data set do you want? PM1, PM25, or PM10? ",
    {"PM1": PM1, "PM25": PM25, "PM10": PM10},
)


######################################################################################

def find_nearest(array, value, tolerance=30):
    """Return the index of the element of ``array`` closest to ``value``.

    Args:
        array: Array-like of numbers to search.
        value: Number to look for.
        tolerance: Largest absolute difference that still counts as a match,
            in the same units as ``array``. Defaults to 30.

    Returns:
        The index reported by ``argmin`` for the closest element, or ``None``
        when ``array`` is empty or the closest element is further than
        ``tolerance`` away from ``value``.
    """
    array = np.asarray(array)
    if array.size == 0:
        # argmin raises ValueError on an empty array; treat it as "no match".
        return None

    distances = np.abs(array - value)
    idx = distances.argmin()
    # argmin returns an index into the flattened array, so read the distance
    # back through .flat; this also works for (n, 1) column inputs.
    if distances.flat[idx] <= tolerance:
        return idx
    return None

#######################################################################################

def make_usable(array1, array):
    """Remove, in place, every position where either list holds NaN.

    Both lists are modified so that they stay index-aligned: when the value at
    a position is NaN in ``array1`` or in ``array``, that position is deleted
    from both.

    Args:
        array1: Mutable list of numbers; its length drives the scan.
        array: Mutable list of numbers, expected to be as long as ``array1``.

    Returns:
        None. The lists are edited in place.
    """
    # Walk backwards so deletions do not shift the positions still to be
    # checked, and go all the way down to index 0 (the first pair included).
    for i in reversed(range(len(array1))):
        if np.isnan(array[i]) or np.isnan(array1[i]):
            del array[i]
            del array1[i]

#######################################################################################

# Pair each air-quality sample with the weather sample closest to it in time.
# The two lists stay index-aligned; a missing weather match is stored as NaN.
aqarr = []
weatherarr = []

for i in range(len(aq_used)):
    aqarr.append(float(aq_used[i]))

    nearest_time = find_nearest(unix_timeweather, unix_timeaq[i])
    weatherarr.append(np.nan if nearest_time is None else float(w_used[nearest_time]))


# Plot the arrays #####################################################################

# Drop the pairs that contain NaN so they reach neither the correlation nor the plot.
make_usable(weatherarr, aqarr)

# pearsonr needs at least two paired samples; with fewer it raises, so show a
# placeholder instead of crashing before anything is drawn.
if len(weatherarr) >= 2:
    pearson_text = str(scipy.stats.pearsonr(weatherarr, aqarr))
else:
    pearson_text = "n/a (fewer than 2 paired samples)"

hoverp = HoverTool(tooltips=[("(x,y)", "($x, $y)")])

# Title and axis labels let the figure stand on its own.
p = figure(
    tools=[hoverp],
    title="Air quality vs. weather",
    x_axis_label="Weather reading",
    y_axis_label="Air quality reading",
)

# Correlation summary pinned near the bottom-left corner in screen units.
# NOTE(review): render_mode appears not to be accepted by Bokeh 3.x -- confirm
# against the installed Bokeh version before upgrading.
correlation = Label(x=50, y=50, x_units='screen', y_units='screen',
                    text="Pearson r and p: " + pearson_text, render_mode='css',
                    border_line_color='black', border_line_alpha=1.0,
                    background_fill_color='white', background_fill_alpha=1.0)

p.add_layout(correlation)
p.circle(x=weatherarr, y=aqarr, color="firebrick")

show(p)


Which weather file would you like to use? 'chs_os_weather.csv'
Which data set do you want? temp, humidity, or pressure? temp
Which air quality file would you like to use? 'chs_os_aq.csv'
Which data set do you want? PM1, PM25, or PM10? PM10